Skill Gap Analysis

Code
import pandas as pd
# Load the dataset that the job-market cells further down read from `df`.
# The path is relative to the directory the notebook is executed from.
df = pd.read_csv(filepath_or_buffer="data/eda_data.csv")
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[1], line 2
      1 import pandas as pd
----> 2 df= pd.read_csv("data/eda_data.csv")

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)
   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   1878     if "b" not in mode:
   1879         mode += "b"
-> 1880 self.handles = get_handle(
   1881     f,
   1882     mode,
   1883     encoding=self.options.get("encoding", None),
   1884     compression=self.options.get("compression", None),
   1885     memory_map=self.options.get("memory_map", False),
   1886     is_text=is_text,
   1887     errors=self.options.get("encoding_errors", "strict"),
   1888     storage_options=self.options.get("storage_options", None),
   1889 )
   1890 assert self.handles is not None
   1891 f = self.handles.handle

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    868 elif isinstance(handle, str):
    869     # Check whether the filename is to be opened in binary mode.
    870     # Binary mode does not support 'encoding' and 'newline'.
    871     if ioargs.encoding and "b" not in ioargs.mode:
    872         # Encoding
--> 873         handle = open(
    874             handle,
    875             ioargs.mode,
    876             encoding=ioargs.encoding,
    877             errors=errors,
    878             newline="",
    879         )
    880     else:
    881         # Binary mode
    882         handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: 'data/eda_data.csv'

Group 11 Skill

Code
import pandas as pd

# Team members, in the order their ratings appear in every list below.
team_members = ["Binderiya", "Pranjul", "Pratham", "Panyang"]

# One integer rating per team member for each skill.
skill_ratings = {
    "Python": [4, 4, 5, 3],
    "SQL": [4, 4, 5, 4],
    "Machine Learning": [2, 3, 2, 2],
    "PySpark": [3, 3, 3, 3],
    "Excel": [4, 5, 5, 4],
    "Data Visualization": [5, 5, 3, 3],
    "Power Bi/ Tableau": [4, 5, 3, 4],
    "Version Control Git": [4, 4, 3, 3],
    "ETL/Data pipeline": [3, 2, 1, 2],
    "Communication": [4, 4, 5, 3],
    "Project Management": [5, 5, 5, 3],
    "Cloud Computing": [4, 4, 2, 2],
}

# Column-oriented table: the "Name" column first, then one column per skill.
skills_data = {"Name": team_members, **skill_ratings}

# Rows are team members (indexed by name), columns are skills.
df_skills = pd.DataFrame(skills_data).set_index("Name")
df_skills
Python SQL Machine Learning PySpark Excel Data Visualization Power Bi/ Tableau Version Control Git ETL/Data pipeline Communication Project Management Cloud Computing
Name
Binderiya 4 4 2 3 4 5 4 4 3 4 5 4
Pranjul 4 4 3 3 5 5 5 4 2 4 5 4
Pratham 5 5 2 3 5 3 3 3 1 5 5 2
Panyang 3 4 2 3 4 3 4 3 2 3 3 2
Code
import seaborn as sns
import matplotlib.pyplot as plt

# Annotated heatmap of the skills table: one row per team member, one column
# per skill, with each cell labelled by its rating.
fig_heatmap, ax_heatmap = plt.subplots(figsize=(7, 4))
sns.heatmap(
    df_skills,
    annot=True,
    cmap="YlGnBu",
    linewidths=0.5,
    ax=ax_heatmap,
)
ax_heatmap.set_title("Team Skill Levels Heatmap")
plt.show()

Code
import os

import plotly.graph_objects as go
from IPython.display import IFrame

# Radar (polar) chart with one filled trace per team member.
fig = go.Figure()

# The angular labels are the same for every trace, so build them once. The
# first skill is repeated at the end so that each polygon closes on itself.
skill_axes = df_skills.columns.tolist()
closed_axes = skill_axes + skill_axes[:1]

for name in df_skills.index:
    values = df_skills.loc[name].tolist()
    values += values[:1]  # close the loop
    fig.add_trace(go.Scatterpolar(
        r=values,
        theta=closed_axes,
        fill='toself',
        name=name
    ))

fig.update_layout(
    polar=dict(radialaxis=dict(visible=True, range=[0, 5])),
    showlegend=True,
    title='Team Skills Radar Chart'
)

# Make sure the output folder exists so that saving the chart does not fail
# when the notebook runs in a fresh checkout.
os.makedirs("figures", exist_ok=True)
fig.write_html("figures/skills_radar_chart.html")
# NOTE(review): this IFrame is not the last expression of the cell, so under
# the default notebook display settings its value is discarded and nothing is
# embedded; the chart the reader sees comes from fig.show() below.
IFrame(src="figures/skills_radar_chart.html", width='100%', height=500)
fig.show()

From this radar chart, we can see that our team has a lot of room for improvement in skills like PySpark and Machine Learning. We can also see that few of our teammates are confident in their Cloud Computing and ETL skills.

Top Skills

Code
# Search terms used to flag data-related job postings.
# Every entry needs its own trailing comma: Python silently concatenates
# adjacent string literals, so a missing comma merges two keywords into one
# string that matches neither term.
keywords = [
    'Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
    'Data Science', 'Data Analysis', 'Data Analytics', 'Market Research Analyst',
    'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
    'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst',
    'Operations Analyst',
]

def match(col):
    """Flag the rows of ``df[col]`` whose text contains any of the keywords.

    The keywords are joined with ``|`` into a single pattern and matched
    case-insensitively. Missing values are reported as ``False`` (``na=False``).
    """
    pattern = '|'.join(keywords)
    return df[col].str.contains(pattern, case=False, na=False)


# A posting counts as a data-analyst job when a keyword appears in its title
# or in either of its skill columns.
is_data_job = match('TITLE_NAME')
for skills_column in ('SKILLS_NAME', 'SPECIALIZED_SKILLS_NAME'):
    is_data_job = is_data_job | match(skills_column)

df['DATA_ANALYST_JOB'] = is_data_job
df['DATA_ANALYST_JOB'].value_counts()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 8
      1 keywords = ['Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
      2             'Data Science', 'Data Analysis','Data Analytics',  'Market Research Analyst' 
      3             'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
      4             'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst', 'Operations Analyst']
      6 match = lambda col: df[col].str.contains('|'.join(keywords), case=False, na=False)
----> 8 df['DATA_ANALYST_JOB'] = match('TITLE_NAME') \
      9              | match('SKILLS_NAME') \
     10              | match('SPECIALIZED_SKILLS_NAME') 
     11 df['DATA_ANALYST_JOB'].value_counts()

Cell In[5], line 6, in <lambda>(col)
      1 keywords = ['Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
      2             'Data Science', 'Data Analysis','Data Analytics',  'Market Research Analyst' 
      3             'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
      4             'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst', 'Operations Analyst']
----> 6 match = lambda col: df[col].str.contains('|'.join(keywords), case=False, na=False)
      8 df['DATA_ANALYST_JOB'] = match('TITLE_NAME') \
      9              | match('SKILLS_NAME') \
     10              | match('SPECIALIZED_SKILLS_NAME') 
     11 df['DATA_ANALYST_JOB'].value_counts()

NameError: name 'df' is not defined
Code
import ast
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

# Parse each non-null SKILLS_NAME value with ast.literal_eval; null values
# become an empty list.
# NOTE(review): presumably SKILLS_NAME holds list literals stored as strings;
# confirm against the dataset.
df['SKILLS'] = df['SKILLS_NAME'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else [])

# Count how often each individual skill appears across the postings flagged
# as data-analyst jobs (one row per skill after explode()).
data_skills = (
    df.loc[df['DATA_ANALYST_JOB'], 'SKILLS']
    .explode()
    .value_counts()
    .reset_index()
)
data_skills.columns = ['Skill', 'Count']

fig = px.bar(data_skills, x='Skill', y='Count',
             title="Top Skills",
             labels={'Skill': 'Skill Name', 'Count': 'Frequency'},
             color='Skill')
# The figure was previously built but never rendered; show it so the
# "Top Skills" chart actually appears in the output.
fig.show()

# Trim stray whitespace from the team member names used as the row index.
df_skills.index = df_skills.index.str.strip()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 7
      4 import plotly.express as px
      6 # Safely apply literal_eval only to non-null values
----> 7 df['SKILLS'] = df['SKILLS_NAME'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else [])
     10 data_skills = df[df['DATA_ANALYST_JOB']]['SKILLS'].explode().value_counts().reset_index()
     11 data_skills.columns = ['Skill', 'Count']

NameError: name 'df' is not defined
Code
from collections import defaultdict

# Normalise both sides to lower-case, whitespace-trimmed text so that team
# skill names can be compared with the skill names found in job postings.
team_skills = [column.lower().strip() for column in df_skills.columns]
job_demand_raw = data_skills.copy()
job_demand_raw['Skill'] = job_demand_raw['Skill'].str.lower().str.strip()

# For every team skill, add up the counts of all posting skills whose text
# contains that team skill as a substring.
skill_demand_map = defaultdict(int)

posting_skills = zip(job_demand_raw['Skill'], job_demand_raw['Count'])
for skill_in_posting, count in posting_skills:
    for team_skill in team_skills:
        if team_skill in skill_in_posting:
            skill_demand_map[team_skill] += count
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 5
      3 # Lowercase everything
      4 team_skills = [s.lower().strip() for s in df_skills.columns]
----> 5 job_demand_raw = data_skills.copy()
      6 job_demand_raw['Skill'] = job_demand_raw['Skill'].str.lower().str.strip()
      8 # New dict to map cleaned team skill to total count from job postings

NameError: name 'data_skills' is not defined
Code
# Debugging aid: print the normalised team skills, the first ten posting
# skills, and every (team skill, posting skill) substring match among them.
team_skills = [column.strip().lower() for column in df_skills.columns]
print("Team skills:", team_skills)

sample_skills = job_demand_raw['Skill'].head(10)
print(sample_skills.tolist())

for skill_text in sample_skills:
    matching = [candidate for candidate in team_skills if candidate in skill_text]
    for team_skill in matching:
        print(f" '{team_skill}' found in: '{skill_text}'")
Team skills: ['python', 'sql', 'machine learning', 'pyspark', 'excel', 'data visualization', 'power bi/ tableau', 'version control git', 'etl/data pipeline', 'communication', 'project management', 'cloud computing']
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 3
      1 team_skills = [s.strip().lower() for s in df_skills.columns]
      2 print("Team skills:", team_skills)
----> 3 print(job_demand_raw['Skill'].head(10).tolist())
      4 for skill_text in job_demand_raw['Skill'].head(10):
      5     for team_skill in team_skills:

NameError: name 'job_demand_raw' is not defined
Code
from collections import defaultdict

# Start from an empty tally. An earlier cell already runs this same
# accumulation into skill_demand_map, so adding to the existing dict would
# count every posting twice (and once more on each re-run of this cell).
skill_demand_map = defaultdict(int)

# For every team skill, add up the counts of all posting skills whose text
# contains that team skill.
for skill_text, count in zip(job_demand_raw['Skill'], job_demand_raw['Count']):
    for team_skill in team_skills:
        if team_skill in skill_text:  # no regex, just substring
            skill_demand_map[team_skill] += count

# Series indexed by team skill, holding the total posting count per skill.
job_demand = pd.Series(skill_demand_map)
print(job_demand)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 1
----> 1 for _, row in job_demand_raw.iterrows():
      2     skill_text = row['Skill']
      3     count = row['Count']

NameError: name 'job_demand_raw' is not defined
Code
# Posting counts per team skill, as a named Series.
job_demand = pd.Series(skill_demand_map, name="Count")

# Mean rating per skill across the team, with labels normalised the same way
# as the job-demand index (trimmed, lower-case) so the two can be aligned.
team_avg = df_skills.mean()
normalised_labels = team_avg.index.str.strip().str.lower()
team_avg.index = normalised_labels

# Keep only the skills present on both sides.
common_skills = job_demand.index.intersection(team_avg.index)
team_avg = team_avg[common_skills]
job_demand = job_demand[common_skills]

# Rescale demand so that the most demanded skill scores 5, which puts it on
# the same 0-5 range as the team ratings.
job_demand_normalized = job_demand.div(job_demand.max()).mul(5)
job_demand_normalized = job_demand_normalized.rename("Job Demand (Normalized)")

# Side-by-side table plus the gap (demand minus team average), largest first.
comparison_df = pd.concat(
    [team_avg, job_demand_normalized],
    axis=1,
    keys=["Team Average Skill", "Job Demand (Normalized)"],
)
comparison_df["Skill Gap"] = comparison_df["Job Demand (Normalized)"].sub(
    comparison_df["Team Average Skill"]
)
comparison_df = comparison_df.sort_values("Skill Gap", ascending=False)

comparison_df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 1
----> 1 job_demand = pd.Series(skill_demand_map)
      2 job_demand.name = "Count"
      3 team_avg = df_skills.mean()

NameError: name 'skill_demand_map' is not defined
Code
# Move the skill labels out of the index into an ordinary column, then give
# that column (named "index" by reset_index) the name "Skill".
comparison_df = comparison_df.reset_index()
comparison_df = comparison_df.rename(columns={"index": "Skill"})
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 comparison_df = comparison_df.reset_index().rename(columns={"index": "Skill"})

NameError: name 'comparison_df' is not defined
Code
import os

import plotly.express as px

# Bar chart of the gap per skill (job demand minus team skill), coloured by
# the size of the gap.
fig = px.bar(
    comparison_df,
    x='Skill',
    y='Skill Gap',
    color='Skill Gap',
    color_continuous_scale='RdBu_r',
    title='Skill Gaps: Job Market Expectations vs. Team Capability',
    labels={'Skill Gap': 'Gap (Job Demand - Team Skill)', 'Skill': 'Skill'},
)

# Dashed zero line separates skills where demand exceeds the team's level
# (bars above) from skills where the team is at or above demand (bars below).
fig.add_hline(y=0, line_dash='dash')
fig.update_layout(
    xaxis_tickangle=-45,
    yaxis_title='Gap (Positive = Market expects more)',
    font=dict(size=13),
    height=500,
    plot_bgcolor='white',
)

# Make sure the output folder exists so that saving the chart does not fail
# when the notebook runs in a fresh checkout.
os.makedirs("figures", exist_ok=True)
fig.write_html("figures/skill_gap_chart.html")
fig.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 4
      1 import plotly.express as px
      3 fig = px.bar(
----> 4     comparison_df,
      5     x='Skill',
      6     y='Skill Gap',
      7     color='Skill Gap',
      8     color_continuous_scale='RdBu_r',
      9     title='Skill Gaps: Job Market Expectations vs. Team Capability',
     10     labels={'Skill Gap': 'Gap (Job Demand - Team Skill)', 'Skill': 'Skill'},
     11 )
     13 fig.add_hline(y=0, line_dash='dash')
     14 fig.update_layout(
     15     xaxis_tickangle=-45,
     16     yaxis_title='Gap (Positive = Market expects more)',
   (...)
     19     plot_bgcolor='white',
     20 )

NameError: name 'comparison_df' is not defined

This bar chart compares our team’s average proficiency in key data-related skills against job market expectations, where each gap is the normalized job demand minus the team’s average skill level. Positive values (such as communication and SQL) mark skills where market demand exceeds our current capabilities, while negative values mark skills where the team is ahead of, or closely aligned with, market needs. Notably, Python, cloud computing, and project management show the largest gaps, suggesting priority areas for upskilling.